import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pycountry_convert import country_alpha2_to_country_name,country_name_to_country_alpha3
# Load the salaries CSV, taking its first column as the row index,
# then echo the frame as a quick preview.
df = pd.read_csv(
    "ds_salaries.csv",
    index_col=0,
)
df
| work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020 | MI | FT | Data Scientist | 70000 | EUR | 79833 | DE | 0 | DE | L |
| 1 | 2020 | SE | FT | Machine Learning Scientist | 260000 | USD | 260000 | JP | 0 | JP | S |
| 2 | 2020 | SE | FT | Big Data Engineer | 85000 | GBP | 109024 | GB | 50 | GB | M |
| 3 | 2020 | MI | FT | Product Data Analyst | 20000 | USD | 20000 | HN | 0 | HN | S |
| 4 | 2020 | SE | FT | Machine Learning Engineer | 150000 | USD | 150000 | US | 50 | US | L |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 602 | 2022 | SE | FT | Data Engineer | 154000 | USD | 154000 | US | 100 | US | M |
| 603 | 2022 | SE | FT | Data Engineer | 126000 | USD | 126000 | US | 100 | US | M |
| 604 | 2022 | SE | FT | Data Analyst | 129000 | USD | 129000 | US | 0 | US | M |
| 605 | 2022 | SE | FT | Data Analyst | 150000 | USD | 150000 | US | 100 | US | M |
| 606 | 2022 | MI | FT | AI Scientist | 200000 | USD | 200000 | IN | 100 | US | L |
607 rows × 11 columns
# Print the frame's schema: index range, column names, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 607 entries, 0 to 606 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 work_year 607 non-null int64 1 experience_level 607 non-null object 2 employment_type 607 non-null object 3 job_title 607 non-null object 4 salary 607 non-null int64 5 salary_currency 607 non-null object 6 salary_in_usd 607 non-null int64 7 employee_residence 607 non-null object 8 remote_ratio 607 non-null int64 9 company_location 607 non-null object 10 company_size 607 non-null object dtypes: int64(4), object(7) memory usage: 56.9+ KB
# Draw the boolean missing-value mask as a heatmap to spot gaps at a glance.
missing_mask = df.isna()
sns.heatmap(missing_mask)
<AxesSubplot: >
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| work_year | salary | salary_in_usd | remote_ratio | |
|---|---|---|---|---|
| count | 607.000000 | 6.070000e+02 | 607.000000 | 607.00000 |
| mean | 2021.405272 | 3.240001e+05 | 112297.869852 | 70.92257 |
| std | 0.692133 | 1.544357e+06 | 70957.259411 | 40.70913 |
| min | 2020.000000 | 4.000000e+03 | 2859.000000 | 0.00000 |
| 25% | 2021.000000 | 7.000000e+04 | 62726.000000 | 50.00000 |
| 50% | 2022.000000 | 1.150000e+05 | 101570.000000 | 100.00000 |
| 75% | 2022.000000 | 1.650000e+05 | 150000.000000 | 100.00000 |
| max | 2022.000000 | 3.040000e+07 | 600000.000000 | 100.00000 |
# Flag job titles that appear at most twice in the dataset,
# then show only the flagged (rare) titles.
title_counts = df["job_title"].value_counts()
jobTitle = title_counts <= 2
jobTitle[jobTitle]
Principal Data Analyst True ETL Developer True Product Data Analyst True Director of Data Engineering True Financial Data Analyst True Cloud Data Engineer True Lead Machine Learning Engineer True NLP Engineer True Head of Machine Learning True 3D Computer Vision Researcher True Data Specialist True Staff Data Scientist True Big Data Architect True Finance Data Analyst True Marketing Data Analyst True Machine Learning Manager True Data Analytics Lead True Name: job_title, dtype: bool
# Number of distinct work years present in the data.
df["work_year"].nunique()
3
# Histogram of how many records fall in each work year.
# Plotly renders its own figure, so the intended 8x5 size is requested through
# Plotly's pixel-based width/height instead of creating a Matplotlib figure
# that would stay empty.
px.histogram(df["work_year"], title="Distribution of Year in the Dataset",
             width=800, height=500)
<Figure size 800x500 with 0 Axes>
# Bar chart of the mean USD salary per work year; bars are coloured by the
# same mean so taller bars also read as darker on the Emrld scale.
salaries_by_year = df.groupby("work_year")
px.bar(
    salaries_by_year[["salary_in_usd"]].mean(),
    title="Average Salary (Year Wise)",
    color=salaries_by_year["salary_in_usd"].mean(),
    color_continuous_scale=px.colors.sequential.Emrld,
)
# Mean USD salary per year expressed as an index relative to the earliest year
# in the data (earliest year = 100). A value of 130 therefore means the mean is
# 30% above the base year. The base is taken from the data itself rather than
# typed in by hand, so it stays correct if the dataset changes.
mean_usd_by_year = df.groupby("work_year")[["salary_in_usd"]].mean()
mean_usd_by_year / mean_usd_by_year.iloc[0] * 100
| salary_in_usd | |
|---|---|
| work_year | |
| 2020 | 100.000000 |
| 2021 | 104.217374 |
| 2022 | 129.963581 |
# Bar chart of the mean USD salary for every job title, coloured by that mean.
usd_by_title = df.groupby("job_title")["salary_in_usd"]
px.bar(
    usd_by_title.mean(),
    title="Average Salary by Job Title",
    color=usd_by_title.mean(),
    color_continuous_scale=px.colors.sequential.Aggrnyl_r,
)
# Horizontal bar chart of the highest USD salary recorded for each job title.
max_usd_by_title = df.groupby("job_title")["salary_in_usd"].max()
px.bar(max_usd_by_title, orientation="h", title="Maximum Salary by Job Title")
# Horizontal bar chart of the lowest USD salary recorded for each job title.
min_usd_by_title = df.groupby("job_title")["salary_in_usd"].min()
px.bar(min_usd_by_title, orientation="h", title="Minimum Salary by Job Title")
# Count how often each job title occurs within each work year.
titles_per_year = df.groupby("work_year")["job_title"]
titles_per_year.value_counts()
work_year job_title
2020 Data Scientist 21
Data Engineer 11
Data Analyst 7
Machine Learning Engineer 5
Big Data Engineer 3
..
2022 ML Engineer 1
Machine Learning Infrastructure Engineer 1
NLP Engineer 1
Principal Data Analyst 1
Principal Data Scientist 1
Name: job_title, Length: 98, dtype: int64
# Histogram of records per experience level, one colour per level.
px.histogram(
    data_frame=df["experience_level"],
    color=df["experience_level"],
    title="Experience of the Individuals",
)
# Bar chart of the mean salary per experience level, coloured by that mean.
# The average is taken over salary_in_usd: the raw "salary" column is stated in
# each record's own salary_currency, so averaging it would mix currencies and
# produce a number that is not comparable between groups.
usd_by_experience = df.groupby("experience_level")["salary_in_usd"]
px.bar(
    usd_by_experience.mean(),
    title="Average Salary as per Experience",
    color=usd_by_experience.mean(),
    color_continuous_scale=px.colors.sequential.Peach,
)
# Histogram of records per employment type, one colour per type.
px.histogram(
    data_frame=df["employment_type"],
    title="Employment Type Distribution",
    color=df["employment_type"],
)
# Bar chart of the mean salary per employment type, coloured by that mean.
# The average is taken over salary_in_usd: the raw "salary" column is stated in
# each record's own salary_currency, so averaging it would mix currencies and
# produce a number that is not comparable between groups.
usd_by_employment = df.groupby("employment_type")["salary_in_usd"]
px.bar(
    usd_by_employment.mean(),
    color=usd_by_employment.mean(),
    color_continuous_scale=px.colors.sequential.Brwnyl,
    title="Average Salary based on Employment Type ",
)
# List the distinct company country codes present in the data.
company_codes = df["company_location"]
company_codes.unique()
array(['DE', 'JP', 'GB', 'HN', 'US', 'HU', 'NZ', 'FR', 'IN', 'PK', 'CN',
'GR', 'AE', 'NL', 'MX', 'CA', 'AT', 'NG', 'ES', 'PT', 'DK', 'IT',
'HR', 'LU', 'PL', 'SG', 'RO', 'IQ', 'BR', 'BE', 'UA', 'IL', 'RU',
'MT', 'CL', 'IR', 'CO', 'MD', 'KE', 'SI', 'CH', 'VN', 'AS', 'TR',
'CZ', 'DZ', 'EE', 'MY', 'AU', 'IE'], dtype=object)
# Replace each company country code with its three-letter form by converting
# alpha-2 -> country name -> alpha-3. Each distinct code is converted once and
# the result is mapped back onto the column.
# NOTE(review): the column is overwritten in place, so this presumably cannot be
# re-run on already-converted values — confirm before re-executing.
company_alpha3_by_alpha2 = {
    code: country_name_to_country_alpha3(country_alpha2_to_country_name(code))
    for code in df["company_location"].unique()
}
df["company_location"] = df["company_location"].map(company_alpha3_by_alpha2)
df["company_location"]
0 DEU
1 JPN
2 GBR
3 HND
4 USA
...
602 USA
603 USA
604 USA
605 USA
606 USA
Name: company_location, Length: 607, dtype: object
# Histogram of records per company country; the count axis is logarithmic and
# the hover label shows the country code.
px.histogram(
    data_frame=df["company_location"],
    title="Count of Companies Location",
    hover_name=df["company_location"],
    log_y=True,
)
# Mean USD salary per company country, drawn as a world choropleth where each
# country is located by the group key and coloured by its mean.
# NOTE(review): df is passed as the data frame although locations and color are
# supplied directly from avgSal_country — confirm whether it is needed.
avgSal_country = df.groupby("company_location")["salary_in_usd"].mean()
country_codes = avgSal_country.index
px.choropleth(
    df,
    locations=country_codes,
    color=avgSal_country,
    title="Average Salary based on Company Location",
    color_continuous_scale=px.colors.sequential.Plasma_r,
)
# Replace each employee residence code with its three-letter form by converting
# alpha-2 -> country name -> alpha-3. Each distinct code is converted once and
# the result is mapped back onto the column.
# NOTE(review): the column is overwritten in place, so this presumably cannot be
# re-run on already-converted values — confirm before re-executing.
residence_alpha3_by_alpha2 = {
    code: country_name_to_country_alpha3(country_alpha2_to_country_name(code))
    for code in df["employee_residence"].unique()
}
df["employee_residence"] = df["employee_residence"].map(residence_alpha3_by_alpha2)
# Histogram of records per employee residence country, on a logarithmic count axis.
px.histogram(
    data_frame=df["employee_residence"],
    title="Employee Resident Country Distribution",
    log_y=True,
)
# Pie chart of how many records belong to each company size, on a 6x6 figure,
# with a legend for the slices.
plt.figure(figsize=(6, 6))
size_counts = df["company_size"].value_counts()
size_counts.plot.pie()
plt.legend()
<matplotlib.legend.Legend at 0x1a6a6ef3940>
# Bar chart of the mean USD salary per company size, coloured by that mean.
usd_by_size = df.groupby("company_size")["salary_in_usd"]
px.bar(
    usd_by_size.mean(),
    title="Salary based on Company Size",
    color=usd_by_size.mean(),
    color_continuous_scale=px.colors.sequential.Aggrnyl_r,
)
# Bar chart of the five most frequent salary currencies.
currency_counts = df["salary_currency"].value_counts()
px.bar(currency_counts.head(5))
# Bar chart of the share of records (in percent) for each remote_ratio value.
# The denominator is the actual row count of df rather than a typed-in number,
# so the percentages stay correct if rows are added, dropped or filtered.
remote_ratio_counts = df["remote_ratio"].value_counts()
px.bar(remote_ratio_counts / len(df) * 100)